gvc_agora_opentargets

Author

Edoardo “Dado” Marcora

Published

December 12, 2024

Setup environment

library(tidyverse)
library(ggformula)
library(janitor)
library(broom)
library(readxl)
library(jsonlite)

library(gprofiler2)

library(mclust)

theme_set(theme_bw())

set.seed(666)

Read and prep data

GVC genes (within 1Mb flanking regions of GVC loci) [OLD]

gvc <-
  read_xlsx("GVC_1Mb_comparison_050224.xlsx") %>%
  clean_names() %>% 
  separate(gene_id, c("ensembl_gene_id", "version")) %>%
  select(-version, -agora_nominated_list, -opentarget_info)

gvc
gvc.genes <-
  gvc %>%
  arrange(absolute_distance) %>%
  distinct(ensembl_gene_id, .keep_all = TRUE) %>%
  select(ensembl_gene_id, gene_symbol, absolute_distance) %>%
  arrange(gene_symbol)

gvc.genes
gvc.genes %>% distinct(gene_symbol) %>% nrow()
[1] 1344

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes

Remove genes in APOE and HLA loci and manually add APOE and HLA genes (based on Bellenguez2022):

gvc.genes.apoe_hla <- gvc.genes %>% filter(ensembl_gene_id %in% c("ENSG00000130203", "ENSG00000196735", "ENSG00000179344", "ENSG00000196126"))

gvc.genes.apoe_hla
gvc.minus_apoe_hla <- gvc %>% filter(grouped_loci_gvc != "APOE / TOMM40" & grouped_loci_gvc != "HLA")

gvc.minus_apoe_hla
gvc.genes.minus_apoe_hla <-
  gvc.minus_apoe_hla %>%
  arrange(absolute_distance) %>%
  distinct(ensembl_gene_id, .keep_all = TRUE) %>%
  select(ensembl_gene_id, gene_symbol, absolute_distance) %>%
  bind_rows(gvc.genes.apoe_hla) %>%
  arrange(gene_symbol)

gvc.genes.minus_apoe_hla

Agora genes

Alzheimer’s disease gene prioritization scores from Agora (see also related journal article):

ago1 <- read_json("agora.syn25741025.overall_scores.v12.2024-10-24.json", simplifyVector = TRUE) %>% as_tibble()

ago1

Alzheimer’s disease genes (Agora nominated targets):

https://agora.adknowledgeportal.org/genes/nominated-targets

ago2 <- read_csv("agora.nominated-targets.gene-list.2024-10-24.csv")
ago2
ago <- ago1 # %>% filter(hgnc_symbol %in% ago2$`Gene Symbol`)

ago

Genetics score

ago %>%
  drop_na(genetics_score) %>%
  gf_density(~ genetics_score, fill = "red", bins = 20) %>% gf_labs(title = "Agora genetics_score")

gmm_fit <- Mclust(ago %>% drop_na(genetics_score) %>% pull(genetics_score), G = 1:6)  # 1 to 6 components
gmm_fit$parameters$mean
     1      2      3      4      5      6 
0.1777 0.7411 1.1681 1.5375 1.9040 2.3111 
sqrt(gmm_fit$parameters$variance$sigmasq)
[1] 0.1687

Multi-omics score

ago %>% drop_na(multi_omics_score) %>% gf_density(~ multi_omics_score, fill = "green", bins = 20) %>% gf_labs(title = "Agora multi_omics_score")

gmm_fit <- Mclust(ago %>% drop_na(multi_omics_score) %>% pull(multi_omics_score), G = 1:6)  # 1 to 6 components
gmm_fit$parameters$mean
       1        2        3        4        5        6 
0.004475 0.432376 0.781154 1.128750 1.457576 1.899564 
sqrt(gmm_fit$parameters$variance$sigmasq)
[1] 0.07474

Target risk score

ago %>% drop_na(target_risk_score) %>% gf_density(~ target_risk_score, fill = "blue", bins = 20) %>% gf_labs(title = "Agora target_risk_score")

gmm_fit <- Mclust(ago %>% drop_na(target_risk_score) %>% pull(target_risk_score), G = 1:6)  # 1 to 6 components
gmm_fit$parameters$mean
     1      2      3      4      5      6 
0.9669 0.7200 1.3287 2.0231 2.8133 3.4076 
sqrt(gmm_fit$parameters$variance$sigmasq)
[1] 0.5048 0.1405 0.3385 0.3090 0.4014 0.4202

OpenTargets genes

Alzheimer’s disease

Alzheimer’s disease gene prioritization scores from OpenTargets:

ot <- read_tsv("OT-MONDO_0004975-associated-targets-6_4_2024-v24_03.tsv", show_col_types = FALSE, na = "No data")

# ot <- read_tsv("OT-MONDO_0004975-associated-targets-10_24_2024-v24_09.tsv", show_col_types = FALSE, na = "No data")

ot

Add Ensembl Gene IDs (WTF!):

otcols <- colnames(ot)
otensg <- gconvert(
  query = ot$symbol,
  organism = "hsapiens",
  target= "ENSG",
  mthreshold = Inf,
  filter_na = TRUE) %>% 
  mutate(input_number = as.character(input_number)) %>%
  left_join(ot %>% rownames_to_column(var = "input_number"), by = "input_number") %>% 
  select(ensembl_gene_id = target, otcols)

otensg

Parkinson’s disease

Parkinson’s disease gene prioritization scores from OpenTargets:

ot.pd <- read_tsv("OT-MONDO_0005180-associated-targets-1_9_2025-v24_09.tsv", show_col_types = FALSE, na = "No data")

ot.pd

Add Ensembl Gene IDs (WTF!):

otcols.pd <- colnames(ot.pd)
otensg.pd <- gconvert(
  query = ot.pd$symbol,
  organism = "hsapiens",
  target= "ENSG",
  mthreshold = Inf,
  filter_na = TRUE) %>% 
  mutate(input_number = as.character(input_number)) %>%
  left_join(ot.pd %>% rownames_to_column(var = "input_number"), by = "input_number") %>% 
  select(ensembl_gene_id = target, otcols.pd)

otensg.pd

Myocardial infarction

Myocardial infarction gene prioritization scores from OpenTargets:

ot.mi <- read_tsv("OT-EFO_0000612-associated-targets-1_9_2025-v24_09.tsv", show_col_types = FALSE, na = "No data")

ot.mi

Add Ensembl Gene IDs (WTF!):

otcols.mi <- colnames(ot.mi)
otensg.mi <- gconvert(
  query = ot.mi$symbol,
  organism = "hsapiens",
  target= "ENSG",
  mthreshold = Inf,
  filter_na = TRUE) %>% 
  mutate(input_number = as.character(input_number)) %>%
  left_join(ot.mi %>% rownames_to_column(var = "input_number"), by = "input_number") %>% 
  select(ensembl_gene_id = target, otcols.mi)

otensg.mi

All protein coding genes in the human genome

gencode <-
  data.frame(rtracklayer::import("gencode.v47.basic.annotation.gtf.gz")) %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  filter(gene_type == "protein_coding") %>%
  mutate(ensembl_gene_id = str_remove(gene_id, pattern = "\\.\\d+$")) %>%
  select(ensembl_gene_id, gene_name) %>%
  slice_sample(n = nrow(.), replace = FALSE) # permute rows

gencode %>% write_tsv("alt/gorilla/gencode.v47.basic.annotation.genes.protein_coding.tsv")

ORA of GVC genes

GVC genes (within 1Mb flanking regions of GVC loci)

Important

unordered query

d0 <- gvc.genes %>% select(ensembl_gene_id, gene_symbol)

d0
query <- d0 %>% distinct(ensembl_gene_id) %>% pull(ensembl_gene_id)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = FALSE, # <- UNORDERED QUERY!
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes

Important

unordered query

d0.minus_apoe_hla <- gvc.genes.minus_apoe_hla %>% select(ensembl_gene_id, gene_symbol)

d0.minus_apoe_hla
query <- d0.minus_apoe_hla %>% distinct(ensembl_gene_id) %>% pull(ensembl_gene_id)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = FALSE, # <- UNORDERED QUERY!
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 200Kb flanking regions of GVC loci) minus APOE and HLA loci genes

Important

unordered query

d0.minus_apoe_hla.200kb <- gvc.genes.minus_apoe_hla %>% filter(absolute_distance <= 200000) %>% select(ensembl_gene_id, gene_symbol)

d0.minus_apoe_hla.200kb
query <- d0.minus_apoe_hla.200kb %>% distinct(ensembl_gene_id) %>% pull(ensembl_gene_id)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = FALSE, # <- UNORDERED QUERY!
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 20Kb flanking regions of GVC loci) minus APOE and HLA loci genes

Important

unordered query

d0.minus_apoe_hla.20kb <- gvc.genes.minus_apoe_hla %>% filter(absolute_distance <= 20000) %>% select(ensembl_gene_id, gene_symbol)

d0.minus_apoe_hla.20kb
query <- d0.minus_apoe_hla.20kb %>% distinct(ensembl_gene_id) %>% pull(ensembl_gene_id)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = FALSE, # <- UNORDERED QUERY!
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

GVC genes (within 1Mb flanking regions of GVC loci) minus APOE and HLA loci genes, ordered by absolute distance from GVC loci

Important

query ordered by absolute distance

d0.minus_apoe_hla <- gvc.genes.minus_apoe_hla %>% arrange(absolute_distance) %>% select(ensembl_gene_id, gene_symbol, absolute_distance)

d0.minus_apoe_hla
query <- d0.minus_apoe_hla %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/gvc.genes.minus_apoe_hla.absolute_distance.query.txt")

multiquery <- c("> gvc.genes.minus_apoe_hla.absolute_distance", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE, # <- ORDERED QUERY!
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

ORA of Agora genes

Agora genes sorted by genetics_score

d5 <- ago %>%
  drop_na(genetics_score) %>%
  filter(genetics_score >= quantile(genetics_score, 0.90)) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(genetics_score))

d5
query <- d5 %>% distinct(ensembl_gene_id) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/agora.genes.genetics_score.query.txt")

multiquery <- c(multiquery, "> agora.genes.genetics_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Agora genes sorted by multi_omics_score

d6 <- ago %>%
  drop_na(multi_omics_score) %>%
  filter(multi_omics_score >= quantile(multi_omics_score, 0.90)) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(multi_omics_score))

d6
query <- d6 %>% distinct(ensembl_gene_id) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/agora.genes.multi_omics_score.query.txt")

multiquery <- c(multiquery, "> agora.genes.multi_omics_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Agora genes sorted by target_risk_score

d7 <- ago %>%
  drop_na(target_risk_score) %>%
  filter(target_risk_score >= quantile(target_risk_score, 0.90)) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(target_risk_score))

d7
query <- d7 %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/agora.genes.target_risk_score.query.txt")

multiquery <- c(multiquery, "> agora.genes.target_risk_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

ORA of OpenTargets genes

Alzheimer’s disease

OpenTargets genes sorted by otGeneticsPortal

d8 <- otensg %>%
  drop_na(otGeneticsPortal) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(otGeneticsPortal))

d8
query <- d8 %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/opentargets.genes.genetics_score.query.txt")

multiquery <- c(multiquery, "> opentargets.genes.genetics_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

OpenTargets genes sorted by globalScore

d9 <- otensg %>%
  drop_na(globalScore) %>%
  filter(globalScore >= quantile(globalScore, 0.90)) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(globalScore))

d9
query <- d9 %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/opentargets.genes.global_score.query.txt")

multiquery <- c(multiquery, "> opentargets.genes.global_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Parkinson’s disease

OpenTargets genes sorted by otGeneticsPortal

d8.pd <- otensg.pd %>%
  drop_na(otGeneticsPortal) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(otGeneticsPortal))

d8.pd
query <- d8.pd %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/opentargets.pd.genes.genetics_score.query.txt")

multiquery <- c(multiquery, "> opentargets.pd.genes.genetics_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

OpenTargets genes sorted by globalScore

d9.pd <- otensg.pd %>%
  drop_na(globalScore) %>%
  filter(globalScore >= quantile(globalScore, 0.90)) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(globalScore))

d9.pd
query <- d9.pd %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/opentargets.pd.genes.global_score.query.txt")

multiquery <- c(multiquery, "> opentargets.pd.genes.global_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Myocardial infarction

OpenTargets genes sorted by otGeneticsPortal

d8.mi <- otensg.mi %>%
  drop_na(otGeneticsPortal) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(otGeneticsPortal))

d8.mi
query <- d8.mi %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/opentargets.mi.genes.genetics_score.query.txt")

multiquery <- c(multiquery, "> opentargets.mi.genes.genetics_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

OpenTargets genes sorted by globalScore

d9.mi <- otensg.mi %>%
  drop_na(globalScore) %>%
  filter(globalScore >= quantile(globalScore, 0.90)) %>%
  sample_frac(1L) %>% # randomize row order before arranging
  arrange(desc(globalScore))

d9.mi
query <- d9.mi %>% distinct(ensembl_gene_id, .keep_all = TRUE) %>% pull(ensembl_gene_id)

writeLines(query, "alt/enrichmentmap/opentargets.mi.genes.global_score.query.txt")

multiquery <- c(multiquery, "> opentargets.mi.genes.global_score", query)

gostres <- gost(
  query = query,
  organism = "hsapiens",
  domain_scope = "annotated",
  exclude_iea = TRUE,
  ordered_query = TRUE,
  significant = TRUE,
  user_threshold = 0.005,
  sources = c("GO:BP", "KEGG", "REAC", "WP", "HP"),
  correction_method = "gSCS")

gostres$result %>%
  select(term_name, term_id, source, everything()) %>%
  filter(term_size >= 5, term_size <= 350, intersection_size >= 3)
gostplot(gostres, capped = FALSE, interactive = TRUE)

Write multiquery to file for later use in enrichmentmap

writeLines(multiquery, "alt/enrichmentmap/multiquery.txt")